Questions: 1) What is the best possible model to predict whether a person will churn or not? 2) What are the most significant variables in the model?
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
AdaBoostClassifier,
GradientBoostingClassifier,
RandomForestClassifier,
BaggingClassifier,
)
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
f1_score,
accuracy_score,
recall_score,
precision_score,
confusion_matrix
)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", lambda x: "%.3f" % x)
# Load the raw churn data and work on a copy so the original stays untouched.
data= pd.read_csv("BankChurners.csv")
data.head()
df=data.copy()
print(f"there are {df.shape[0]} rows and {df.shape[1]} columns")
df.isnull().sum() #there are missing values on the Education_Level and Marital_Status columns
df.duplicated().sum() #there is no duplicated rows on the data set
df.dtypes
df.describe(include="all").T
# CLIENTNUM is a unique customer identifier with no predictive value.
df=df.drop(["CLIENTNUM"], axis=1)
list_str = list(df.select_dtypes(include=['object']).columns)
list_str
# Treat the low-cardinality integer count columns as categorical as well,
# not only the object-dtype columns.
cat_list=["Attrition_Flag","Gender","Education_Level","Marital_Status","Income_Category","Card_Category","Dependent_count","Total_Relationship_Count","Months_Inactive_12_mon","Contacts_Count_12_mon"]
for column in cat_list:
    df[column]=df[column].astype("category")
df.dtypes # recheck the data types
# Frequency table for every categorical feature.
for column in cat_list:
    print(df[column].value_counts())
    print("#"*50)
def labeled_barplot(data, feature, percentage=False, n=None):
    """Draw a countplot of `feature` with every bar labeled.

    Args:
        data: DataFrame containing `feature`.
        feature: name of the column to plot.
        percentage: if True, label bars with percent of all rows instead of counts.
        n: show only the `n` most frequent categories (None = all).

    Returns:
        None; displays the figure.
    """
    total = len(data[feature])  # denominator for percentage labels
    count = data[feature].nunique()
    # Scale figure width with the number of categories actually shown.
    plt.figure(figsize=((count if n is None else n) + 1, 6))
    plt.xticks(rotation=45, fontsize=10)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Set2",
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        # `if percentage:` instead of comparing a boolean to True.
        if percentage:
            label = "{:.1f}%".format(100 * p.get_height() / total)
        else:
            label = p.get_height()
        x = p.get_x() + p.get_width() / 2
        y = p.get_height()
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )
    plt.show()
# Labeled barplot for every categorical feature.
# labeled_barplot displays the figure itself and returns None, so calling it
# inside print() (as before) only printed the word "None" after each plot.
for feature in cat_list:
    labeled_barplot(df, feature, percentage=True)
    print("#"*80)
def boxplot_hitogram(data, feature, figsize=(10, 5), kde=True, bins=20):
    """Show a boxplot (top) and histogram (bottom) of a numeric feature on a shared x-axis.

    Args:
        data: DataFrame containing `feature`.
        feature: numeric column to plot.
        figsize: overall figure size.
        kde: overlay a kernel-density estimate on the histogram.
        bins: histogram bin count; a falsy value lets seaborn choose.

    Returns:
        None; displays the figure. The histogram axis is annotated with the
        mean (solid lime line) and median (dashed red line).
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,
        sharex=True,
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="yellowgreen"
    )
    # Plain if/else instead of a conditional expression evaluated only for its
    # side effects; `color=` is the correct argument for a single-hue
    # histogram (`palette=` applies only when `hue` is given).
    if bins:
        sns.histplot(
            data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins, color="pink"
        )
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2)
    ax_hist2.axvline(data[feature].mean(), color="lime", linestyle="-")  # mean
    ax_hist2.axvline(data[feature].median(), color="red", linestyle="--")  # median
    plt.show()
# --- Univariate analysis of the numeric features ---
num_list= list(df.select_dtypes(include=['float64',"int64"]).columns)
num_list
df.describe().T
boxplot_hitogram(df, "Customer_Age", figsize=(10, 5), kde=True, bins=50)
Customer ages ranged from 26 to 73, and the mean value is 46.3 and normally distributed. Mean and median scores are almost the same. There are outliers.
boxplot_hitogram(df, "Months_on_book", figsize=(10, 5), kde=True, bins=20)
The mean and median scores of the Months_on_book variable are the same. There are outliers.
boxplot_hitogram(df, "Credit_Limit", figsize=(10, 5), kde=True, bins=50)
Credit_Limit is right skewed. There are outliers.Mean score is 8631.9
boxplot_hitogram(df, "Total_Revolving_Bal", figsize=(10, 5), kde=True, bins=50)
Total_Revolving_Bal is right skewed. Box plot shows that there is no outliers.
boxplot_hitogram(df, "Avg_Open_To_Buy", figsize=(10, 5), kde=True, bins=50)
Avg_Open_To_Buy is skewed right and there are outliers.
boxplot_hitogram(df, "Total_Amt_Chng_Q4_Q1", figsize=(10, 5), kde=True, bins=50)
The mean of the Total_Amt_Chng_Q4_Q1 variable is 7469.1 and the mean and median scores are almost same. There are many outliers.
boxplot_hitogram(df, "Total_Trans_Ct", figsize=(10, 5), kde=True, bins=50)
Total_Trans_Ct is not normally distributed; there are two peaks. There are outliers.
boxplot_hitogram(df, "Total_Trans_Amt", figsize=(10, 5), kde=True, bins=50)
Total_Trans_Amt is not normally distributed; there are peaks; It can be categorical. There are outliers.
boxplot_hitogram(df, "Total_Ct_Chng_Q4_Q1", figsize=(10, 5), kde=True, bins=50)
Total_Ct_Chng_Q4_Q1 has outliers. It ranged from 0 to 3.714 and has 0.274 mean value.
boxplot_hitogram(df, "Avg_Utilization_Ratio", figsize=(10, 5), kde=True, bins=50)
Avg_Utilization_Ratio is skwed right and has 0.274 mean value and ranged from 0 to 0.999.
# Pairwise relationships of the numeric features colored by churn status.
sns.pairplot(df, hue="Attrition_Flag")
plt.figure(figsize=(15,8)) #There is a very high correlation between credit_Limit and Avg_Open_To_Buy, so one of them should be dropped.
# Restrict corr() to numeric columns: df now contains category-dtype columns,
# and recent pandas raises instead of silently dropping non-numeric data.
sns.heatmap(df.select_dtypes(include=np.number).corr(), annot=True)
def stc_barplot(data, predictor, target):
    """Print crosstabs of `predictor` vs `target` and draw a stacked proportion barplot.

    Args:
        data: DataFrame containing both columns.
        predictor: categorical feature on the x-axis.
        target: categorical target whose class proportions are stacked.

    Returns:
        None; prints the count crosstab and displays the normalized plot.
    """
    count = data[predictor].nunique()
    # value_counts() sorts descending, so index[-1] is the least frequent
    # target class (the churn class in this data set); rows are sorted by it.
    sorter = data[target].value_counts().index[-1]
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("*" * 100)
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 1, 5))
    # Single legend call: the original called plt.legend twice and the first
    # (lower-left, frameless) legend was immediately replaced by the second.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
# --- Bivariate analysis: each categorical predictor vs churn status ---
stc_barplot(df, "Education_Level", "Attrition_Flag")
stc_barplot(df, "Gender", "Attrition_Flag")
stc_barplot(df, "Marital_Status", "Attrition_Flag")
stc_barplot(df, "Income_Category", "Attrition_Flag")
stc_barplot(df, "Card_Category", "Attrition_Flag")
stc_barplot(df, "Dependent_count", "Attrition_Flag")
stc_barplot(df, "Total_Relationship_Count", "Attrition_Flag")
stc_barplot(df, "Months_Inactive_12_mon", "Attrition_Flag")
stc_barplot(df, "Contacts_Count_12_mon", "Attrition_Flag")
# --- Each numeric feature split by churn status ---
plt.figure(figsize=(10,5))
sns.boxplot(x="Attrition_Flag", y="Customer_Age", data=df, orient="vertical")
plt.figure(figsize=(10,5))
sns.boxplot(x="Attrition_Flag", y="Months_on_book", data=df, orient="vertical")
plt.figure(figsize=(10,5))
sns.boxplot(x="Attrition_Flag", y="Credit_Limit", data=df, orient="vertical")
plt.figure(figsize=(10,5))
sns.boxplot(x="Attrition_Flag", y="Total_Revolving_Bal", data=df, orient="vertical")
plt.figure(figsize=(10,5))
sns.boxplot(x="Attrition_Flag", y="Avg_Open_To_Buy", data=df, orient="vertical")
plt.figure(figsize=(10,5))
sns.boxplot(x="Attrition_Flag", y="Total_Amt_Chng_Q4_Q1", data=df, orient="vertical")
plt.figure(figsize=(10,5))
sns.boxplot(x="Attrition_Flag", y="Total_Trans_Amt", data=df, orient="vertical")
plt.figure(figsize=(10,5))
sns.boxplot(x="Attrition_Flag", y="Total_Trans_Ct", data=df, orient="vertical")
plt.figure(figsize=(10,5))
sns.boxplot(x="Attrition_Flag", y="Total_Ct_Chng_Q4_Q1", data=df, orient="vertical")
plt.figure(figsize=(10,5))
sns.boxplot(x="Attrition_Flag", y="Avg_Utilization_Ratio", data=df, orient="vertical")
The ages of the attrited and existing customers have similar median scores. Attrited and existing customers' median scores of months on book are similar. Existing customers' total revolving balance median score is higher than the attrited customers' total revolving balance median score.
# --- Missing-value inspection ---
df.isnull().sum()
df[df["Education_Level"].isnull()]
df[df["Income_Category"]=="abc"]  # "abc" is a placeholder for unknown income
df.groupby(by=["Income_Category"])["Gender"].value_counts()
# Convert the "abc" placeholder to NaN so it gets imputed later.
# NOTE(review): Series.replace(..., inplace=True) is deprecated in recent pandas.
df["Income_Category"].replace({"abc":np.nan}, inplace=True)
df["Income_Category"].isnull().sum()
df[df["Marital_Status"].isnull()]
df["Marital_Status"].value_counts()
There is no trend between the missing values and the other features, so the missing values in the Marital_Status column will be filled with the mode.
Some skewed features will behave better after a transformation. np.arcsinh is used for the transformation (rather than a log transform) because some features contain 0 values.
df1=df.copy()
# Avg_Open_To_Buy is almost perfectly correlated with Credit_Limit, so keep
# only one of the pair.
df1.drop(columns=["Avg_Open_To_Buy"], inplace=True)
# arcsinh-transform the skewed features (arcsinh handles zeros, unlike log).
# All four transforms read from df1 — the original mixed df and df1 as the
# source (harmless only because df1 was still an untouched copy of df here).
for col in ["Credit_Limit", "Total_Amt_Chng_Q4_Q1",
            "Total_Ct_Chng_Q4_Q1", "Avg_Utilization_Ratio"]:
    df1[col + '_arc'] = np.arcsinh(df1[col])
    df1.drop(col, axis=1, inplace=True)
    # histplot(..., kde=True) replaces sns.distplot, which is removed in
    # modern seaborn.
    sns.histplot(df1[col + '_arc'], kde=True)
    plt.show()
Let's bin some numerical features which have peaks.
# --- Bin the peaked numeric features into ordered categorical ranges ---
bins = [-np.inf, 50, 100, np.inf]
labels = ['<50', '50-100', '>100']
df1['Total_Trans_Ct_bins'] = pd.cut(df1['Total_Trans_Ct'], bins=bins, labels=labels, include_lowest=True)
# Keyword argument: passing the Series positionally to countplot is
# deprecated/removed in recent seaborn.
sns.countplot(x=df1['Total_Trans_Ct_bins'])
df1['Total_Trans_Ct_bins'].value_counts()
df1.drop(columns=["Total_Trans_Ct"], inplace=True)
bins = [-np.inf, 1000, 2000, np.inf]
labels = ['<1000', '1000-2000', '>2000']
df1['Total_Revolving_Bal_bins'] = pd.cut(df1['Total_Revolving_Bal'], bins=bins, labels=labels, include_lowest=True)
df1.drop(columns=["Total_Revolving_Bal"], inplace=True)
sns.countplot(x=df1['Total_Revolving_Bal_bins'])
df1['Total_Revolving_Bal_bins'].value_counts()
bins = [-np.inf, 3000,6000,9000,12000,15000, 18000, np.inf]
labels = ['<3000', '3000-6000','6000-9000','9000-12000','12000-15000','15000-18000', '>18000']
df1['Total_Trans_Amt_bins'] = pd.cut(df1['Total_Trans_Amt'], bins=bins, labels=labels, include_lowest=True)
df1.drop(columns=["Total_Trans_Amt"], inplace=True)
plt.figure(figsize=(10,5))
sns.countplot(x=df1['Total_Trans_Amt_bins'])
df1.dtypes
df1["Income_Category"]=df1["Income_Category"].astype("category") #lets change the data type of Income_Category
# Encode the target: Attrited Customer -> 1, Existing Customer -> 0.
# Plain assignment instead of inplace=True, which is deprecated for
# Series.replace in recent pandas.
df1["Attrition_Flag"] = df1["Attrition_Flag"].replace({"Existing Customer":0, "Attrited Customer":1})
df1["Attrition_Flag"].value_counts() #recheck the values
df1.head()

def iqr_outliers(series, k=4):
    """Return the values of `series` farther than k*IQR from the median.

    Same rule as the original copy-pasted cells: quartiles are computed on the
    non-null values and the distance is measured from the median. NaNs never
    compare greater than the threshold, so they are never flagged.
    """
    q1, q3 = np.quantile(series[series.notnull()], [.25, .75])
    return series[np.abs(series - series.median()) > k * (q3 - q1)]

outliers_Customer_Age = iqr_outliers(df1['Customer_Age'])
outliers_Customer_Age  # There are no outliers
outliers_Credit_Limit_arc = iqr_outliers(df1['Credit_Limit_arc'])
outliers_Credit_Limit_arc  # There are no outliers
outliers_Total_Ct_Chng_Q4_Q1_arc = iqr_outliers(df1['Total_Ct_Chng_Q4_Q1_arc'])
outliers_Total_Ct_Chng_Q4_Q1_arc  # There are outliers, drop them
df1.drop(outliers_Total_Ct_Chng_Q4_Q1_arc.index, axis=0, inplace=True)
outliers_Total_Amt_Chng_Q4_Q1_arc = iqr_outliers(df1['Total_Amt_Chng_Q4_Q1_arc'])
outliers_Total_Amt_Chng_Q4_Q1_arc  # There are outliers, drop them
df1.drop(outliers_Total_Amt_Chng_Q4_Q1_arc.index, axis=0, inplace=True)
outliers_Avg_Utilization_Ratio_arc = iqr_outliers(df1['Avg_Utilization_Ratio_arc'])
outliers_Avg_Utilization_Ratio_arc  # There is no outlier
outliers_Months_on_book = iqr_outliers(df1['Months_on_book'])
outliers_Months_on_book  # There is no outlier
df2=df1.copy()
X = df2.drop(["Attrition_Flag"], axis=1)
y = df2["Attrition_Flag"]
# 60/20/20 stratified split: carve off the test set first, then split the
# remainder into train and validation.
X_temp,X_test,y_temp,y_test=train_test_split(X,y, test_size=0.2, random_state=1, stratify=y) #split the data first temp and test and then split the temp data to train and val sets
X_train, X_val,y_train, y_val=train_test_split(X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp)
print(X_train.shape, X_val.shape, X_test.shape)
# Impute the categorical columns with the mode learned on the train set only
# (fit on train, transform val/test -> no data leakage).
imp_mode = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
cols_to_impute = ["Education_Level", "Marital_Status","Income_Category"]
X_train[cols_to_impute] = imp_mode.fit_transform(X_train[cols_to_impute])
X_val[cols_to_impute] = imp_mode.transform(X_val[cols_to_impute])
X_test[cols_to_impute] = imp_mode.transform(X_test[cols_to_impute])
# One-hot encode each split, then align val/test to the train columns:
# get_dummies applied per split can produce different column sets when a
# category is absent from one split, which would break model.predict later.
X_train = pd.get_dummies(data=X_train, drop_first=True)
X_val = pd.get_dummies(data=X_val, drop_first=True)
X_test = pd.get_dummies(data=X_test, drop_first=True)
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_train.shape
The bank could face two types of losses. If the model cannot detect an attrited customer, the bank loses money; if the model flags an existing customer as attrited, the bank loses time. Since losing money is costlier than losing time, the recall score will be used as the evaluation metric for model performance.
def classification_model_performance(model, predictors, target):
    """Score a fitted classifier and return its metrics as a one-row DataFrame.

    Args:
        model: fitted classifier exposing predict().
        predictors: feature matrix to score.
        target: true labels.

    Returns:
        DataFrame with columns Accuracy, Recall, Precision, F1 (single row).
    """
    predicted = model.predict(predictors)
    scores = {
        "Accuracy": accuracy_score(target, predicted),
        "Recall": recall_score(target, predicted),
        "Precision": precision_score(target, predicted),
        "F1": f1_score(target, predicted),
    }
    return pd.DataFrame(scores, index=[0])
def confusion_matrix_classification(model, predictors, target):
    """Plot the 2x2 confusion matrix as a heatmap annotated with count and percent.

    Args:
        model: fitted binary classifier exposing predict().
        predictors: feature matrix to score.
        target: true labels.
    """
    predicted = model.predict(predictors)
    cm = confusion_matrix(target, predicted)
    total = cm.flatten().sum()
    # Each cell shows the raw count and its share of all predictions.
    cell_texts = [
        "{0:0.0f}".format(v) + "\n{0:.2%}".format(v / total) for v in cm.flatten()
    ]
    annotations = np.asarray(cell_texts).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=annotations, fmt="")
    plt.ylabel("True")
    plt.xlabel("Predicted")
# --- Baseline models on the (imbalanced) training data ---
# Each model is fit on X_train, then scored on train and validation sets with
# classification_model_performance and visualized with a confusion matrix.
lr = LogisticRegression(random_state=1)
lr.fit(X_train, y_train)
Log_reg_model_performance_train= classification_model_performance(lr, X_train, y_train)
Log_reg_model_performance_train
Log_reg_model_performance_val= classification_model_performance(lr, X_val, y_val)
Log_reg_model_performance_val
confusion_matrix_classification(lr, X_train, y_train)
confusion_matrix_classification(lr, X_val, y_val)
The model is not overfitting but the recall score is so low.
dtree=DecisionTreeClassifier(random_state=1)
dtree.fit(X_train,y_train)
d_tree_model_performance_train= classification_model_performance(dtree, X_train, y_train)
d_tree_model_performance_train
d_tree_model_performance_val= classification_model_performance(dtree, X_val, y_val)
d_tree_model_performance_val
confusion_matrix_classification(dtree, X_train, y_train)
confusion_matrix_classification(dtree, X_val, y_val)
The model is overfitting; the model performs well on the train set but not on the validation set.
bagging= BaggingClassifier(random_state=1)
bagging.fit(X_train,y_train)
bagging_model_performance_train= classification_model_performance(bagging, X_train, y_train)
bagging_model_performance_train
bagging_model_performance_val= classification_model_performance(bagging, X_val, y_val)
bagging_model_performance_val
confusion_matrix_classification(bagging, X_train, y_train)
confusion_matrix_classification(bagging, X_val, y_val)
The model is overfitting, the recall score is very high for train set but not validation set.
rf=RandomForestClassifier(random_state=1)
rf.fit(X_train,y_train)
random_forest_model_performance_train= classification_model_performance(rf, X_train, y_train)
random_forest_model_performance_train
random_forest_model_performance_val= classification_model_performance(rf, X_val, y_val)
random_forest_model_performance_val
confusion_matrix_classification(rf, X_train, y_train)
confusion_matrix_classification(rf, X_val, y_val)
The model is overfitting, it perform well on the train set but not on the val set, recall score is low on the val set.
ada = AdaBoostClassifier(random_state=1)
ada.fit(X_train,y_train)
ada_boosting_model_performance_train= classification_model_performance(ada, X_train, y_train)
ada_boosting_model_performance_train
ada_boosting_model_performance_val= classification_model_performance(ada, X_val, y_val)
ada_boosting_model_performance_val
confusion_matrix_classification(ada, X_train, y_train)
confusion_matrix_classification(ada, X_val, y_val)
The model is not overfit but recall scores are so low on train and val sets.
gbc = GradientBoostingClassifier(random_state=1)
gbc.fit(X_train,y_train)
gbc_model_performance_train= classification_model_performance(gbc, X_train, y_train)
gbc_model_performance_train
gbc_model_performance_val= classification_model_performance(gbc, X_val, y_val)
gbc_model_performance_val
confusion_matrix_classification(gbc, X_train, y_train)
confusion_matrix_classification(gbc, X_val, y_val)
The model is not overfitting, but the recall score on the train and val sets are low.
xgb = XGBClassifier(random_state=1, eval_metric='logloss')
xgb.fit(X_train,y_train)
xgb_model_performance_train= classification_model_performance(xgb, X_train, y_train)
xgb_model_performance_train
xgb_model_performance_val= classification_model_performance(xgb, X_val, y_val)
xgb_model_performance_val
# NOTE(review): unlike the other models, the train-set confusion matrix is
# not shown for XGB — only the validation one.
confusion_matrix_classification(xgb, X_val, y_val)
The model performs well on the train set but not on the validation set. The model is overfitting.
# --- Models on SMOTE-oversampled training data ---
# sampling_strategy=1 resamples the minority (attrited) class up to a 1:1
# ratio with the majority class; only the train split is resampled.
print("Before oversampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before oversampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
sm = SMOTE(
sampling_strategy=1, k_neighbors=5, random_state=1)
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
print("After oversampling, counts of label 'Yes': {}".format(sum(y_train_over == 1)))
print("After oversampling, counts of label 'No': {} \n".format(sum(y_train_over == 0)))
print("After oversampling, the shape of train_X: {}".format(X_train_over.shape))
print("After oversampling, the shape of train_y: {} \n".format(y_train_over.shape))
# Same model line-up as the baseline, now fit on the oversampled train set
# but still validated on the untouched X_val.
lr_over = LogisticRegression(random_state=1)
lr_over.fit(X_train_over, y_train_over)
lr_over_performance_train= classification_model_performance(lr_over, X_train_over, y_train_over)
lr_over_performance_train
lr_over_performance_val= classification_model_performance(lr_over, X_val, y_val)
lr_over_performance_val
confusion_matrix_classification(lr_over, X_train_over, y_train_over)
confusion_matrix_classification(lr_over, X_val, y_val)
The model is overfitting
dtree_over=DecisionTreeClassifier(random_state=1)
dtree_over.fit(X_train_over, y_train_over)
dtree_over_performance_train= classification_model_performance(dtree_over, X_train_over, y_train_over)
dtree_over_performance_train
dtree_over_performance_val= classification_model_performance(dtree_over, X_val, y_val)
dtree_over_performance_val
confusion_matrix_classification(dtree_over, X_train_over, y_train_over)
confusion_matrix_classification(dtree_over, X_val, y_val)
The model is overfitting
bagging_over= BaggingClassifier(random_state=1)
bagging_over.fit(X_train_over, y_train_over)
bagging_over_performance_train= classification_model_performance(bagging_over, X_train_over, y_train_over)
bagging_over_performance_train
bagging_over_performance_val= classification_model_performance(bagging_over, X_val, y_val)
bagging_over_performance_val
confusion_matrix_classification(bagging_over, X_train_over, y_train_over)
confusion_matrix_classification(bagging_over, X_val, y_val)
The recall score is low on val set, the model is overfitting
rf_over=RandomForestClassifier(random_state=1)
rf_over.fit(X_train_over, y_train_over)
rf_over_performance_train= classification_model_performance(rf_over, X_train_over, y_train_over)
rf_over_performance_train
rf_over_performance_val= classification_model_performance(rf_over, X_val, y_val)
rf_over_performance_val
confusion_matrix_classification(rf_over, X_train_over, y_train_over)
confusion_matrix_classification(rf_over, X_val, y_val)
The model is overfitting
ada_over = AdaBoostClassifier(random_state=1)
ada_over.fit(X_train_over, y_train_over)
ada_over_performance_train= classification_model_performance(ada_over, X_train_over, y_train_over)
ada_over_performance_train
ada_over_performance_val= classification_model_performance(ada_over, X_val, y_val)
ada_over_performance_val
confusion_matrix_classification(ada_over, X_train_over, y_train_over)
confusion_matrix_classification(ada_over, X_val, y_val)
The model is overfitting
gbc_over = GradientBoostingClassifier(random_state=1)
gbc_over.fit(X_train_over, y_train_over)
gbc_over_performance_train= classification_model_performance(gbc_over, X_train_over, y_train_over)
gbc_over_performance_train
gbc_over_performance_val= classification_model_performance(gbc_over, X_val, y_val)
gbc_over_performance_val
confusion_matrix_classification(gbc_over, X_train_over, y_train_over)
confusion_matrix_classification(gbc_over, X_val, y_val)
xgb_over = XGBClassifier(random_state=1, eval_metric='logloss')
xgb_over.fit(X_train_over, y_train_over)
xgb_over_performance_train= classification_model_performance(xgb_over, X_train_over, y_train_over)
xgb_over_performance_train
xgb_over_performance_val= classification_model_performance(xgb_over, X_val, y_val)
xgb_over_performance_val
confusion_matrix_classification(xgb_over, X_train_over, y_train_over)
confusion_matrix_classification(xgb_over, X_val, y_val)
The oversampled models performed well on the train set but did not perform well on the validation set. The Gradient Boosting Classifier on oversampled data is better than the other models trained on oversampled data.
# --- Models on randomly undersampled training data ---
# RandomUnderSampler shrinks the majority (existing) class down to the
# minority class size; only the train split is resampled.
rus = RandomUnderSampler(random_state=1)
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)
lr_under = LogisticRegression(random_state=1)
lr_under.fit(X_train_under, y_train_under)
lr_under_performance_train= classification_model_performance(lr_under, X_train_under, y_train_under)
lr_under_performance_train
lr_under_performance_val= classification_model_performance(lr_under, X_val, y_val)
lr_under_performance_val
confusion_matrix_classification(lr_under, X_train_under, y_train_under)
confusion_matrix_classification(lr_under, X_val, y_val)
The recall scores on the train and val sets are good but not perfect, and accuracy scores on the train and val sets are low.
dtree_under=DecisionTreeClassifier(random_state=1)
dtree_under.fit(X_train_under, y_train_under)
dtree_under_performance_train= classification_model_performance(dtree_under, X_train_under, y_train_under)
dtree_under_performance_train
dtree_under_performance_val= classification_model_performance(dtree_under, X_val, y_val)
dtree_under_performance_val
confusion_matrix_classification(dtree_under, X_train_under, y_train_under)
confusion_matrix_classification(dtree_under, X_val, y_val)
The model is overfitting.
bagging_under= BaggingClassifier(random_state=1)
bagging_under.fit(X_train_under, y_train_under)
bagging_under_performance_train= classification_model_performance(bagging_under, X_train_under, y_train_under)
bagging_under_performance_train
bagging_under_performance_val= classification_model_performance(bagging_under, X_val, y_val)
bagging_under_performance_val
confusion_matrix_classification(bagging_under, X_train_under, y_train_under)
confusion_matrix_classification(bagging_under, X_val, y_val)
The recall score on the val set is low.
rf_under=RandomForestClassifier(random_state=1)
rf_under.fit(X_train_under, y_train_under)
rf_under_performance_train= classification_model_performance(rf_under, X_train_under, y_train_under)
rf_under_performance_train
rf_under_performance_val= classification_model_performance(rf_under, X_val, y_val)
rf_under_performance_val
confusion_matrix_classification(rf_under, X_train_under, y_train_under)
confusion_matrix_classification(rf_under, X_val, y_val)
The model performs well on the train set but not well on the val set.
ada_under = AdaBoostClassifier(random_state=1)
ada_under.fit(X_train_under, y_train_under)
ada_under_performance_train= classification_model_performance(ada_under, X_train_under, y_train_under)
ada_under_performance_train
ada_under_performance_val= classification_model_performance(ada_under, X_val, y_val)
ada_under_performance_val
confusion_matrix_classification(ada_under, X_train_under, y_train_under)
confusion_matrix_classification(ada_under, X_val, y_val)
The recall scores are good and the model is not overfitting but the accuracy scores are low.
gbc_under = GradientBoostingClassifier(random_state=1)
gbc_under.fit(X_train_under, y_train_under)
gbc_under_performance_train= classification_model_performance(gbc_under, X_train_under, y_train_under)
gbc_under_performance_train
gbc_under_performance_val= classification_model_performance(gbc_under, X_val, y_val)
gbc_under_performance_val
confusion_matrix_classification(gbc_under, X_train_under, y_train_under)
confusion_matrix_classification(gbc_under, X_val, y_val)
xgb_under = XGBClassifier(random_state=1, eval_metric='logloss')
xgb_under.fit(X_train_under, y_train_under)
xgb_under_performance_train= classification_model_performance(xgb_under, X_train_under, y_train_under)
xgb_under_performance_train
xgb_under_performance_val= classification_model_performance(xgb_under, X_val, y_val)
xgb_under_performance_val
confusion_matrix_classification(xgb_under, X_train_under, y_train_under)
confusion_matrix_classification(xgb_under, X_val, y_val)
# Collect every model's train/val metric rows side by side (one column per
# model/split); the column-name list below must stay in the same order as the
# concat list above it.
models_comp = pd.concat( [
    Log_reg_model_performance_train.T,
    Log_reg_model_performance_val.T,
    d_tree_model_performance_train.T,
    d_tree_model_performance_val.T,
    bagging_model_performance_train.T,
    bagging_model_performance_val.T,
    random_forest_model_performance_train.T,
    random_forest_model_performance_val.T,
    ada_boosting_model_performance_train.T,
    ada_boosting_model_performance_val.T,
    gbc_model_performance_train.T,
    gbc_model_performance_val.T,
    xgb_model_performance_train.T,
    xgb_model_performance_val.T,
    lr_over_performance_train.T,
    lr_over_performance_val.T,
    dtree_over_performance_train.T,
    dtree_over_performance_val.T,
    bagging_over_performance_train.T,
    bagging_over_performance_val.T,
    rf_over_performance_train.T,
    rf_over_performance_val.T,
    ada_over_performance_train.T,
    ada_over_performance_val.T,
    gbc_over_performance_train.T,
    gbc_over_performance_val.T,
    xgb_over_performance_train.T,
    xgb_over_performance_val.T,
    lr_under_performance_train.T,
    lr_under_performance_val.T,
    dtree_under_performance_train.T,
    dtree_under_performance_val.T,
    bagging_under_performance_train.T,
    bagging_under_performance_val.T,
    rf_under_performance_train.T,
    rf_under_performance_val.T,
    ada_under_performance_train.T,
    ada_under_performance_val.T,
    gbc_under_performance_train.T,
    gbc_under_performance_val.T,
    xgb_under_performance_train.T,
    xgb_under_performance_val.T], axis=1)
models_comp.columns = [
    "Logistic Regression Train",
    "Logistic Regression Val",
    "Decision Tree Train",
    "Decision Tree Val",
    "Bagging Classifier Train",
    "Bagging Classifier Val",
    "Random Forest Model Train",
    "Random Forest Model Val",
    "AdaBoost Classifier Train",
    "AdaBoost Classifier Val",
    "Gradient Boosting Train",
    "Gradient Boosting Val",
    "XGBoost Classifier Train",
    "XGBoost Classifier Val",
    "Logistic Regression on oversampled Train",
    "Logistic Regression on oversampled Val",
    "Decision Tree on oversampled Train",
    "Decision Tree on oversampled Val",
    "Bagging Classifier on oversampled Train",
    "Bagging Classifier on oversampled Val",
    "Random Forest on oversampled Train",
    "Random Forest on oversampled Val",
    "AdaBoost Classifier on oversampled Train",
    "AdaBoost Classifier on oversampled Val",
    "Gradient Boosting Classifier on oversampled Train",
    "Gradient Boosting Classifier on oversampled Val",
    "XGBoost Classifier on oversampled Train",
    "XGBoost Classifier on oversampled Val",
    "Logistic Regression on undersampled Train",
    "Logistic Regression on undersampled Val",
    "Decision Tree on undersampled Train",
    "Decision Tree on undersampled Val",
    "Bagging Classifier on undersampled Train",
    "Bagging Classifier on undersampled Val",
    "Random Forest on undersampled Train",
    "Random Forest on undersampled Val",
    "AdaBoost Classifier on undersampled Train",
    "AdaBoost Classifier on undersampled Val",
    "Gradient Boosting Classifier on undersampled Train",
    "Gradient Boosting Classifier on undersampled Val",
    "XGBoost Classifier on undersampled Train",
    "XGBoost Classifier on undersampled Val"]
models_comp
The Gradient Boosting Classifier on oversampled data, the XGBoost Classifier on undersampled data, and the Gradient Boosting Classifier on undersampled data can be tuned to improve their performance.
# --- Hyperparameter tuning: Gradient Boosting on undersampled data ---
# RandomizedSearchCV with recall as the scorer (the chosen evaluation metric).
model_gbc_under = GradientBoostingClassifier(random_state=1)
parameters = {
    "n_estimators": [50,100,150,200,250],
    "subsample":[0.8,0.9,1],
    "max_features":[0.7,0.8,0.9,1]
}
scorer = metrics.make_scorer(metrics.recall_score)
gbc_tuned1 =RandomizedSearchCV(estimator=model_gbc_under, param_distributions=parameters, n_iter=50, scoring=scorer, cv=5, random_state=1)
gbc_tuned1.fit(X_train_under, y_train_under)
print("Best parameters are {} with CV score={}:" .format(gbc_tuned1.best_params_,gbc_tuned1.best_score_))
gbc_under_tuned_performance_train= classification_model_performance(gbc_tuned1, X_train_under, y_train_under)
gbc_under_tuned_performance_train
gbc_under_tuned_performance_val= classification_model_performance(gbc_tuned1, X_val, y_val)
gbc_under_tuned_performance_val
# --- Hyperparameter tuning: XGBoost on undersampled data ---
model_xgb_under = XGBClassifier(random_state=1,eval_metric='logloss')
# Parameter grid to pass in RandomizedSearchCV
param_grid={'n_estimators':np.arange(50,200,50),
    'learning_rate':[0.01,0.1,0.2,0.05],
    'gamma':[0,1,3,5],
    'subsample':[0.8,0.9,1],
    'max_depth':np.arange(1,5,1)
}
# Type of scoring used to compare parameter combinations (recall, as above)
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
xgb_tuned1 = RandomizedSearchCV(estimator=model_xgb_under, param_distributions=param_grid, n_iter=100, scoring=scorer, cv=10, random_state=1, n_jobs = -1)
#Fitting parameters in RandomizedSearchCV
xgb_tuned1.fit(X_train_under,y_train_under)
print("Best parameters are {} with CV score={}:" .format(xgb_tuned1.best_params_,xgb_tuned1.best_score_))
xgb_under_tuned_performance_train= classification_model_performance(xgb_tuned1, X_train_under, y_train_under)
xgb_under_tuned_performance_train
xgb_under_tuned_performance_val= classification_model_performance(xgb_tuned1, X_val, y_val)
xgb_under_tuned_performance_val
# --- Hyperparameter tuning: Gradient Boosting on oversampled data ---
# Same search space and recall scorer as the undersampled GBC search.
model = GradientBoostingClassifier(random_state=1)
parameters = {
    "n_estimators": [50,100,150,200,250],
    "subsample":[0.8,0.9,1],
    "max_features":[0.7,0.8,0.9,1]
}
scorer = metrics.make_scorer(metrics.recall_score)
gbc_tuned =RandomizedSearchCV(estimator=model, param_distributions=parameters, n_iter=50, scoring=scorer, cv=5, random_state=1)
gbc_tuned.fit(X_train_over, y_train_over)
print("Best parameters are {} with CV score={}:" .format(gbc_tuned.best_params_,gbc_tuned.best_score_))
gbc_over_tuned_performance_train= classification_model_performance(gbc_tuned, X_train_over, y_train_over)
gbc_over_tuned_performance_train
gbc_over_tuned_performance_val= classification_model_performance(gbc_tuned, X_val, y_val)
gbc_over_tuned_performance_val
Select the tuned Gradient Boosting Classifier on undersampled data as the final model because it is not overfitting, has a higher recall score on the train and validation sets, and has a high accuracy score. Recall is the evaluation metric for this problem, but other metrics such as accuracy are also considered when choosing the final model: if the recall score is very high while the accuracy score is very low, the model fails to distinguish attrited from existing customers. The tuned Gradient Boosting Classifier on undersampled data achieves both high recall and high accuracy. Before examining the model's feature importances, let's check its performance on the test set.
# Final-model performance on the held-out test set.
gbc_under_tuned_performance_test= classification_model_performance(gbc_tuned1, X_test, y_test)
gbc_under_tuned_performance_test
top_params=gbc_tuned1.best_params_
top_params
feature_names = X_train.columns
# Refit the final model with the tuned hyperparameters taken directly from
# the search result. The original hard-coded the values by hand and left
# `top_params` unused, which risks drifting out of sync with the search.
gbm_model = GradientBoostingClassifier(random_state=1, **top_params)
gbm_model.fit(X_train_under,y_train_under)
importances=gbm_model.feature_importances_
indices = np.argsort(importances)  # ascending, so the top feature plots last
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
The tuned Gradient Boosting Classifier on undersampled data is the best model among those evaluated, so it can be used as the final model. It also performs well on the test set. Total_Trans_Amt_bins_3000-6000, Total_Ct_Chng_Q4_Q1_arc, and Total_Revolving_Bal_bins_1000-2000 are the three most important features in the model.
# --- Production pipeline: preprocessing + tuned final model in one object ---
# Feature lists refer to the transformed/binned frame df2.
numerical_features = ['Customer_Age', 'Months_on_book', 'Credit_Limit_arc',
    'Total_Amt_Chng_Q4_Q1_arc', 'Total_Ct_Chng_Q4_Q1_arc',
    'Avg_Utilization_Ratio_arc']
# Numeric columns: median imputation only (no scaling needed for trees).
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
categorical_features = ['Gender','Dependent_count','Education_Level','Marital_Status','Income_Category','Card_Category','Total_Relationship_Count','Months_Inactive_12_mon','Contacts_Count_12_mon','Total_Trans_Ct_bins','Total_Revolving_Bal_bins','Total_Trans_Amt_bins']
# Categorical columns: mode imputation, then one-hot encoding that tolerates
# unseen categories at predict time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))])
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough",
)
# Refit on a simple 70/30 stratified split of the full prepared data.
X = df2.drop("Attrition_Flag", axis=1)
Y = df2["Attrition_Flag"]
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1, stratify=Y
)
print(X_train.shape, X_test.shape)
# Final estimator uses the hyperparameters found by the randomized search on
# the undersampled data.
model = Pipeline(
    steps=[
        ("pre", preprocessor),
        (
            "GBC",
            GradientBoostingClassifier(
                random_state = 1,
                subsample = 0.9,
                n_estimators = 250,
                max_features = 0.8
            ),
        ),
    ]
)
model.fit(X_train, y_train)